Why do we need to improve the training method?

In the previous note, we managed to get the neural net to

  1. converge to any value at a given input, and
  2. emulate a step function.

However, the training failed to emulate functions such as the absolute value and sine.
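The improvement in this note is mini-batch training: rather than updating the weights after every single sample, the `train_batch` method of the `Network` class below averages the per-sample gradients over a batch of $N$ training pairs and then applies a single gradient-descent step,

$$
\partial W = \frac{1}{N} \sum_{i=1}^{N} \partial W^{(i)}, \qquad
\delta = \frac{1}{N} \sum_{i=1}^{N} \delta^{(i)},
$$

where $\partial W^{(i)}$ and $\delta^{(i)}$ are the weight and bias gradients computed by back-propagation on the $i$-th sample. Averaging smooths out the noise of individual samples before each update.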


In [1]:
%pylab inline
%config InlineBackend.figure_format = 'retina'


Populating the interactive namespace from numpy and matplotlib

In [2]:
import numpy as np
from random import random
from IPython.display import FileLink, FileLinks

In [3]:
def σ(z):
    # logistic sigmoid
    return 1/(1 + np.e**(-z))

def σ_prime(z):
    # derivative of the sigmoid: σ'(z) = σ(z) * (1 - σ(z)) = e**z / (e**z + 1)**2
    return np.e**(z) / (np.e**z + 1)**2

In [4]:
def Plot(fn, *args, **kwargs):
    """Plot fn over [start, end], where args[0] is [start, end] or [start, end, points]."""
    if len(args) == 1:
        start, end = args[0][0], args[0][1]
        points = args[0][2] if len(args[0]) > 2 else 30
        xs = linspace(start, end, points)
        plot(xs, list(map(fn, xs)), **kwargs)

In [5]:
Plot(σ, [-2, 2])



In [6]:
# a single neuron: neuron = [weight, bias]
y = lambda neuron, input: neuron[0] * input + neuron[1]
α = lambda neuron, input: σ(y(neuron, input))
# ∂α/∂w and ∂α/∂b (the latter equals ∂α/∂y), used for gradient descent
partial_w = lambda neuron, input: \
    σ_prime(y(neuron, input)) * input
partial_y = lambda neuron, input: \
    σ_prime(y(neuron, input))

In [7]:
class Neuron():
    def __init__(self, neuron):
        self.neuron = neuron
        
    def output(self, input):
        return α(self.neuron, input)
    
    def set_η(self, η):
        self.η = η
        
    def train(self, input, target, η=None):
        result = self.output(input);
        δ = result - target
        p_w = partial_w(self.neuron, input)
        p_y = partial_y(self.neuron, input)
        
        gradient = np.array([p_w, p_y])#/np.sqrt(p_w**2 + p_y**2)
        if η is None:
            η = self.η
        self.neuron = - η * δ * gradient + self.neuron;
        return result

In [71]:
class Network():
    def __init__(self, shape, parameters=None):
        self.shape = shape;
        self.zs = {};
        self.αs = {};
        
        self.weights = {};
        self.biases = {};
        
        self.δs = {};
        self.partial_ws = {};
        
        if parameters is not None:
            weights, biases = parameters;
            self.weights = weights;
            self.biases = biases;
        else:
            for i in range(1, len(shape)):
                self.create_network(i, shape[i])
        
    def create_network(self, ind, size):
        if ind == 0: return
        self.weights[ind] = np.random.random(self.shape[ind-1:ind+1][::-1]) - 0.5
        self.biases[ind] = np.random.random(self.shape[ind]) - 0.5
        
    def get_partials_placeholder(self):
        partial_ws = {};
        δs = {};
        
        for ind in range(1, len(self.shape)):
            partial_ws[ind] = np.zeros(self.shape[ind-1:ind+1][::-1])
            δs[ind] = np.zeros(self.shape[ind])
        
        return partial_ws, δs;
        
    def output(self, input=None):
        if input is not None:
            self.forward_pass(input);
        return self.αs[len(self.shape) - 1]
    
    def set_η(self, η=None):
        if η is None: return
        self.η = η
        
    def train(self, input, target, η=None):
        if η is None:
            η = self.η        
        self.forward_pass(input)
        self.back_propagation(target)
        self.gradient_descent(η)
        
    # done: generate a mini-batch of training data,
    # and take the average of the gradients over the mini-batch
    def train_batch(self, inputs, targets, η=None):
        inputs_len = np.shape(inputs)[0]
        targets_len = np.shape(targets)[0]
        
        assert inputs_len == targets_len, \
            "input and target need to have the same first dimension"
        
        N = inputs_len
        
        partial_ws, δs = self.get_partials_placeholder()
        
        # print(partial_ws, δs)
        
        for input, target in zip(inputs, targets):
            
            # print(input, target)
            self.forward_pass(input)
            self.back_propagation(target)
            
            for ind in range(1, len(self.shape)):
                partial_ws[ind] += self.partial_ws[ind] / float(N)
                δs[ind] += self.δs[ind] / float(N)

        self.partial_ws = partial_ws
        self.δs = δs
        
        if η is None:
            η = self.η
        self.gradient_descent(η)
        
    def forward_pass(self, input):
        # forward passing
        self.αs[0] = input;
        for i in range(1, len(self.shape)):
            self.forward_pass_layer(i);
    
    def back_propagation(self, target):
        # back-propagation
        ind_last = len(self.shape) - 1
        self.δs[ind_last] = σ_prime(self.zs[ind_last]) * \
            (self.αs[ind_last] - target);
        for i in list(range(1, len(self.shape)))[::-1]:
            self.back_propagation_layer(i)
    
    def gradient_descent(self, η):     
        # gradient descent
        for i in range(1, len(self.shape)):
            self.gradient_descent_layer(i, η)
        
    def forward_pass_layer(self, ind):
        """ind is the index of the current network"""
        self.zs[ind] = self.biases[ind] + \
            np.tensordot(self.weights[ind], self.αs[ind - 1], axes=1)
        self.αs[ind] = σ(self.zs[ind])
        
    def back_propagation_layer(self, ind):
        """ind \in [len(self.shape) - 1, 1]"""
        if ind > 1:
            self.δs[ind - 1] = σ_prime(self.zs[ind-1]) * \
                np.tensordot(self.δs[ind], self.weights[ind], axes=1)
        self.partial_ws[ind] = np.tensordot(self.δs[ind], self.αs[ind - 1], axes=0)
        
    def gradient_descent_layer(self, ind, η):
        """ind \in [1, ...len(shape) - 1]"""
        self.weights[ind] = self.weights[ind] - η * self.partial_ws[ind]
        self.biases[ind] = self.biases[ind] - η * self.δs[ind]

Now let's train the same way as before, one sample at a time, to validate the new class.


In [9]:
# train on a step function, one sample at a time (as in the previous note)
target_func = lambda x: 1 if x < 0.5 else 0
nw = Network([1, 4,  1])

figure(figsize=(16, 4))
subplot(131)

# todo: generate a mini-batch of training data,
# and take the average of the gradients over the mini-batch

inputs = [[x] for x in np.linspace(0, 1, 100)]
targets = [[target_func(x)] for x in np.linspace(0, 1, 100)]

for ind in range(10000):
    x = np.random.random()
    nw.train([x], [target_func(x)], 10)

    scatter(x, target_func(x))

Plot(lambda x: nw.output([x])[0], [0, 1], label="neural net")
Plot(lambda x: target_func(x), [0, 1], color='r', linewidth=4, alpha=0.3, label="target function")
xlim(-0.25, 1.25)
ylim(-0.25, 1.25)
legend(loc=3, frameon=False)

subplot(132)
imshow(nw.weights[1], interpolation='none', aspect=1);colorbar();
subplot(133)
imshow(nw.weights[2], interpolation='none', aspect=1);colorbar()
# subplot(144)
# imshow(nw.weights[3], interpolation='none', aspect=1);colorbar()


Out[9]:
<matplotlib.colorbar.Colorbar at 0x1128eaeb8>

In [145]:
# train on the same step function, this time with mini-batches
target_func = lambda x: 1 if x < 0.5 else 0
nw = Network([1, 4,  1])

figure(figsize=(4, 4))
#subplot(141)


batch_size = 10
inputs = [[x] for x in np.linspace(0, 1, batch_size)]
targets = [[target_func(x)] for x in np.linspace(0, 1, batch_size)]

n = 0
for i in range(3):
    for ind in range(40):
        n += 1;
        nw.train_batch(inputs, targets, 10)
    Plot(lambda x: nw.output([x])[0], [0, 1], label="NN {} batches".format(n))

plot([i[0] for i in inputs], [t[0] for t in targets], 'r.', label="training data")
xlim(-0.25, 1.25)
ylim(-0.25, 1.25)
_title = "Training Progress Through\nMini-batches (4 hidden neurons)"
title(_title, fontsize=15)
legend(loc=(1.2, 0.25), frameon=False)

fn = "004 batch training " + _title.replace('\n', ' ') + ".png"
savefig(fn, dpi=300, 
        bbox_inches='tight', 
               transparent=True,
               pad_inches=0)
FileLink(fn)




Now a sinusoidal function (the code below uses $\cos^2 x$).


In [143]:
# train a [1, 10, 1] network on cos²(x) with mini-batches
target_func = lambda x: np.cos(x)**2
nw = Network([1, 10,  1])

figure(figsize=(16, 4))
#subplot(141)


batch_size = 100
grid = np.linspace(0, 10, batch_size)

inputs = [[x] for x in grid]
targets = [[target_func(x)] for x in grid]

n = 0
for i in range(4):
    for ind in range(500):
        n += 1;
        nw.train_batch(inputs, targets, 40)
    Plot(lambda x: nw.output([x])[0], [0, 10], label="NN {} batches".format(n))

plot([i[0] for i in inputs], [t[0] for t in targets], 'r.', label="training data")

_title = "Training Progress Through Mini-batches (10 hidden neurons)"
title(_title)
xlim(-0.25, 10.25)
ylim(-0.25, 1.25)
legend(loc=4, frameon=False)
fn = "004 batch training " + _title + ".png"
savefig(fn, dpi=300, 
        bbox_inches='tight', 
               transparent=True,
               pad_inches=0)
FileLink(fn)




Now, what about an absolute value function?

Well, as it turns out, encoding an absolute value function is hard. You can play with the code below and try to learn it, but with fewer than 10 hidden neurons the result is usually pretty terrible.

It is possible, however, to learn half of the absolute value function, which amounts to encoding only a straight line.
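To see why this half is easy, note that on the training interval $[0, 0.5]$ the target reduces to a straight line:

$$|x - 0.5| = 0.5 - x \quad \text{for } x \le 0.5,$$

so the network in the next cell only has to fit a line segment.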


In [212]:
# train a [1, 20, 1] network on the left half of |x - 0.5|
target_func = lambda x: np.abs(x - 0.5)
nw = Network([1, 20,  1])

figure(figsize=(6, 6))

batch_size = 40
grid = np.linspace(0, 0.5, batch_size)

inputs = [[x] for x in grid]
targets = [[target_func(x)] for x in grid]

n = 0
for i in range(4):
    for ind in range(1000):
        n += 1;
        nw.train_batch(inputs, targets, 23)
    Plot(lambda x: nw.output([x])[0], [0, 1.0], label="NN {} batches".format(n))

plot([i[0] for i in inputs], [t[0] for t in targets], 'r.', label="training data")

_title = "Emulate Half of An Absolute Value Function"
title(_title)
xlim(-0.25, 1.25)
ylim(-0.25, 1.25)
legend(loc=1, frameon=False)
fn = "004 batch training " + _title.replace('\n', ' ') + ".png"
savefig(fn,
        dpi=300,
        bbox_inches='tight',
        transparent=True,
        pad_inches=0)
FileLink(fn)




Now equipped with this set of hyper-parameters, I thought:

"If I can train each of the two halves of the
*absolute value function* separately, I can build
the entire function by adding these two
halves together, right?"

Then I tried doubling the number of hidden neurons.

And amazingly, it just worked.
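A quick sanity check of that intuition, written as an equation: the absolute value is just the sum of its two half-ramps,

$$|x - 0.5| = \max(x - 0.5,\ 0) + \max(0.5 - x,\ 0),$$

so if 20 hidden neurons are enough for one half, it is at least plausible that 40 can cover both halves at once.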


In [211]:
# train a [1, 40, 1] network on the full |x - 0.5|, with twice the hidden neurons
target_func = lambda x: np.abs(x - 0.5)
nw = Network([1, 40,  1])

figure(figsize=(6, 6))

batch_size = 80
grid = np.linspace(0, 1, batch_size)

inputs = [[x] for x in grid]
targets = [[target_func(x)] for x in grid]

n = 0
for i in range(4):
    for ind in range(4000):
        n += 1;
        nw.train_batch(inputs, targets, 10)
    Plot(lambda x: nw.output([x])[0], [0, 1.0], label="NN {} batches".format(n))

plot([i[0] for i in inputs], [t[0] for t in targets], 'r.', label="training data")

_title = "Emulate An Absolute\nFunction (2 times of hidden neurons)"
title(_title)
xlim(-0.25, 1.25)
ylim(-0.25, 1.25)
legend(loc=1, frameon=False)
fn = "004 batch training " + _title.replace('\n', ' ') + ".png"
savefig(fn,
        dpi=300,
        bbox_inches='tight',
        transparent=True,
        pad_inches=0)
FileLink(fn)




